import matplotlib.pyplot as plt
import pandas as pd
# Load the titles table from the local CSV; the rest of the file reads from `df`.
df = pd.read_csv('./data/netflix_titles.csv')
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
# Assuming 'df' is your DataFrame containing the data
# Function to extract numeric value from duration
def extract_duration(duration):
    """Return the leading number of a duration label such as ``"90 min"``.

    Args:
        duration: A raw ``duration`` cell. Strings are expected to start with
            an integer token (``"90 min"``, ``"2 Seasons"``). Python ints and
            floats (including NaN) are passed through untouched.

    Returns:
        The leading integer for parsable strings, the input itself for ints
        and floats, and ``None`` for anything that cannot be parsed (empty or
        non-numeric strings, ``None``, other types).
    """
    if isinstance(duration, str):
        tokens = duration.split()
        # An empty or whitespace-only string has no first token to parse.
        if not tokens:
            return None
        try:
            return int(tokens[0])
        except ValueError:
            return None
    if isinstance(duration, (int, float)):
        return duration
    # None and any other non-string value cannot be interpreted as a duration.
    return None
# Parse the raw duration labels into numbers so they can be plotted.
df['duration_numeric'] = df['duration'].apply(extract_duration)

# Keep only titles released within the last 20 years.
current_year = pd.Timestamp.now().year
last_20_years_df = df.loc[df['release_year'] >= current_year - 20]

# Split the recent titles by type.
tv_shows_df = last_20_years_df.loc[last_20_years_df['type'] == 'TV Show']
movies_df = last_20_years_df.loc[last_20_years_df['type'] == 'Movie']

# One panel per type: TV shows on the left, movies on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: seasons per TV show against release year.
tv_scatter = axs[0].scatter(tv_shows_df['release_year'], tv_shows_df['duration_numeric'])
axs[0].set(
    xlabel='Release Year',
    ylabel='Duration (Seasons)',
    title='TV Shows (Last 20 Years)',
)

# Right panel: running time per movie against release year.
movies_scatter = axs[1].scatter(movies_df['release_year'], movies_df['duration_numeric'])
axs[1].set(
    xlabel='Release Year',
    ylabel='Duration (Minutes)',
    title='Movies (Last 20 Years)',
)

fig.tight_layout()
# Add hover functionality.
# One reusable annotation per axes, created on first use. Re-using it keeps
# labels from piling up on the figure as the mouse moves.
_hover_annotations = {}


def hover(event):
    """Show the title of the scatter point under the mouse cursor.

    Connected to the figure's ``motion_notify_event``. For each panel, the
    panel's single annotation is moved to the first point under the cursor
    and labelled with that row's ``title``; when the cursor is over no point
    in that panel, the annotation is hidden.

    Args:
        event: The matplotlib mouse event passed in by the canvas.
    """
    panels = [
        (tv_scatter, tv_shows_df, axs[0]),
        (movies_scatter, movies_df, axs[1]),
    ]
    needs_redraw = False
    for scatter, df_type, ax in panels:
        annotation = _hover_annotations.get(ax)
        hit, details = scatter.contains(event)
        if not hit:
            # Cursor left the point: hide a label that is still showing.
            if annotation is not None and annotation.get_visible():
                annotation.set_visible(False)
                needs_redraw = True
            continue
        index = details["ind"][0]
        title = df_type.iloc[index]['title']
        if annotation is None:
            annotation = ax.annotate(
                "",
                xy=(0, 0),
                xytext=(5, 5), textcoords='offset points',
                fontsize=10, color='black',
                arrowprops=dict(arrowstyle='-', color='black'),
            )
            _hover_annotations[ax] = annotation
        point = scatter.get_offsets()[index]
        annotation.xy = (point[0], point[1])
        annotation.set_text(f"Title: {title}")
        annotation.set_visible(True)
        needs_redraw = True
    if needs_redraw:
        # Redraw the canvas that produced the event rather than "the current
        # figure", which may be a different figure by the time this runs.
        event.canvas.draw_idle()


fig.canvas.mpl_connect('motion_notify_event', hover)
plt.show()
import plotly.express as px
# fig = px.scatter(df, x='release_year', y='country')
# fig.update_layout(title='Explore Data', xaxis_title='Release Year', yaxis_title='Country release' )
# fig.show()
# Scatter of release year against country; hovering a point shows its title.
explore_labels = {'release_year': 'Release Year', 'country': 'Country release'}
fig = px.scatter(
    df,
    x='release_year',
    y='country',
    title='Explore Data',
    labels=explore_labels,
    hover_name='title',
    # The title is already the hover heading, so keep it out of the hover body.
    hover_data={'title': False},
)
# Update layout
fig.update_layout(
    title='Explore Data',
    xaxis_title='Release Year',
    yaxis_title='Country release',
)
# Show plot
fig.show()
# Hover on the plot to get the options to zoom
from ipywidgets import interact, Dropdown
# Create the list of dropdown choices: every distinct value of the director
# column, in alphabetical order. Missing values are dropped first, because
# NaN is not a director anyone can meaningfully select.
directors = sorted(df['director'].dropna().unique())
# Set the default director
default_director = "Spike Lee"
def plot_movies_by_director(selected_director=default_director):
    """Plot how many titles a director has and when each one was released.

    Draws two stacked panels: a single bar with the number of matching rows,
    and a line plot of release year per title.

    Args:
        selected_director: Value to match exactly against the ``director``
            column. ``None`` or NaN draws nothing.
    """
    # pd.isna covers None as well as NaN, which the director column holds
    # for rows without a director.
    if pd.isna(selected_director):
        return
    movies = df.loc[df['director'] == selected_director, ['title', 'release_year']]

    _, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 8))

    # Top panel: number of rows credited to the director.
    movies_count = len(movies)
    ax1.bar(selected_director, movies_count, color='skyblue')
    ax1.set_ylabel('Number of Movies')
    ax1.set_title(f"Number of Movies Directed by {selected_director}")

    # Bottom panel: release year of each title.
    ax2.plot(movies['title'], movies['release_year'], marker='o', color='orange', linestyle='-')
    ax2.set_ylabel('Release Year')
    ax2.set_xlabel('Movie Title')
    ax2.set_title(f"Release Years of Movies Directed by {selected_director}")
    ax2.grid(True)
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
# Create interactive dropdown menu: the dropdown's current value is passed to
# plot_movies_by_director as `selected_director`.
interact(plot_movies_by_director, selected_director=Dropdown(options=directors, value=default_director))
# Notebook output of the cell above:
# interactive(children=(Dropdown(description='selected_director', index=81, options=('Kirsten Johnson', nan, 'Ju…
# <function __main__.plot_movies_by_director(selected_director='Spike Lee')>
from wordcloud import WordCloud
# Path to a TrueType font file
font_path = "./data/Lato-Regular.ttf"
# One list of director names per title; rows without a director are dropped.
directors = df['director'].dropna().str.split(', ')
# Number of titles credited to each individual director.
director_counts = directors.explode().value_counts().to_dict()
# Build the cloud from explicit "Name (count)" -> count weights. Passing one
# long string to generate() would have the text re-tokenised into separate
# words, so the full "Name (count)" labels would never appear as such.
label_frequencies = {
    f"{director} ({count})": count
    for director, count in director_counts.items()
}
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    font_path=font_path,
).generate_from_frequencies(label_frequencies)
# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Split the comma-separated director strings into one row per
# (title, director) credit. explode() only splits list-like cells, so the
# strings have to be turned into lists first; rows without a director are
# dropped.
director_credits = df.dropna(subset=['director']).copy()
director_credits['director'] = director_credits['director'].str.split(',')
director_credits = director_credits.explode('director')
director_credits['director'] = director_credits['director'].str.strip()

tv_directors = director_credits[director_credits['type'] == 'TV Show']
movie_directors = director_credits[director_credits['type'] == 'Movie']

# Number of credited titles per director, separately for TV shows and movies
tv_director_counts = tv_directors.groupby('director').size()
movie_director_counts = movie_directors.groupby('director').size()

# Filter directors with at least 2 occurrences for TV Shows and 10 occurrences for Movies
filtered_tv_directors = tv_director_counts[tv_director_counts >= 2]
filtered_movie_directors = movie_director_counts[movie_director_counts >= 10]


def get_movies(director, content_type='Movie'):
    """Return the titles of one type that credit ``director``.

    Args:
        director: A single director name, as produced by splitting the
            ``director`` column on commas.
        content_type: Value of the ``type`` column to restrict to. Defaults
            to ``'Movie'``.

    Returns:
        A list of titles; empty when nothing matches.
    """
    credited = (
        (director_credits['director'] == director)
        & (director_credits['type'] == content_type)
    )
    return director_credits.loc[credited, 'title'].tolist()


# Create dataframes for the plots. The hover column keeps the name 'movies'
# in both frames, but for the TV chart it lists the director's TV shows so
# that it matches the bar being hovered.
tv_df = pd.DataFrame({'director': filtered_tv_directors.index, 'count': filtered_tv_directors.values})
tv_df['movies'] = tv_df['director'].apply(get_movies, content_type='TV Show')
movie_df = pd.DataFrame({'director': filtered_movie_directors.index, 'count': filtered_movie_directors.values})
movie_df['movies'] = movie_df['director'].apply(get_movies)

# Create Plotly figures
fig_tv = px.bar(tv_df, x='director', y='count', color='count',
                labels={'director': 'Director', 'count': 'Frequency'},
                title='Frequency of Directors in TV Shows (>= 2 occurrences)',
                hover_data={'director': False, 'count': False, 'movies': True})
fig_tv.update_xaxes(tickangle=45)
fig_movie = px.bar(movie_df, x='director', y='count', color='count',
                   labels={'director': 'Director', 'count': 'Frequency'},
                   title='Frequency of Directors in Movies (>= 10 occurrences)',
                   hover_data={'director': False, 'count': False, 'movies': True})
fig_movie.update_xaxes(tickangle=45)

# Show the plots
fig_tv.show()
fig_movie.show()
import matplotlib.pyplot as plt
# Percentage share of each `country` value, computed separately per type.
tv_shows = df.loc[df['type'] == 'TV Show']
movies = df.loc[df['type'] == 'Movie']
tv_shows_country_counts = tv_shows['country'].value_counts(normalize=True).mul(100)
movies_country_counts = movies['country'].value_counts(normalize=True).mul(100)
# Function to plot pie chart with custom label
def plot_pie_chart(country_counts, total_count, title):
    """Draw a pie chart of country shares, folding small slices into 'Other'.

    Hovering a wedge emphasises its label (bold, larger font).

    Args:
        country_counts: Series of percentage shares indexed by country label.
        total_count: Number of titles the percentages refer to; used to turn
            a share back into an approximate title count for the wedge label.
        title: Chart title.
    """
    threshold = 5  # shares below this percentage are merged into 'Other'
    is_major = country_counts >= threshold
    major_countries = country_counts[is_major]
    other_share = country_counts[~is_major].sum()

    labels = major_countries.index.tolist()
    sizes = major_countries.tolist()
    # Add the combined slice exactly once, and only when it is non-empty.
    if other_share > 0:
        labels.append('Other')
        sizes.append(other_share)

    def format_label(pct):
        # `pct` is the wedge's percentage of the pie as supplied by autopct;
        # scale it by total_count to get an approximate number of titles.
        count = round(pct / 100 * total_count)
        if count == 0:
            return ''
        return f'{pct:.1f}%\n(~{count} of {total_count})'

    fig, ax = plt.subplots()
    wedges, texts, autotexts = ax.pie(
        sizes,
        labels=labels,
        autopct=format_label,
        startangle=90,
        colors=plt.cm.tab20.colors,
    )
    ax.set_title(title)
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

    # Emphasise the label of the wedge under the cursor.
    def hover(event):
        for wedge, text in zip(wedges, texts):
            highlighted = wedge.contains(event)[0]
            text.set_fontweight('bold' if highlighted else 'normal')
            text.set_fontsize(12 if highlighted else 10)
        # Redraw this chart's own figure; a later chart may have become the
        # "current" figure by the time the callback fires.
        fig.canvas.draw_idle()

    fig.canvas.mpl_connect('motion_notify_event', hover)
# Plotting pie charts: one figure per type, then show them together.
for shares, catalogue, chart_title in (
    (tv_shows_country_counts, tv_shows, 'TV Shows by Country'),
    (movies_country_counts, movies, 'Movies by Country'),
):
    plot_pie_chart(shares, len(catalogue), chart_title)
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
%matplotlib inline
# Assuming 'df' is your DataFrame containing the data
# Split the comma-separated director strings into one row per credit before
# counting. explode() alone leaves plain strings untouched, so co-directed
# titles would otherwise be counted under the combined string.
threshold_credits = df[['type', 'director']].dropna(subset=['director'])
threshold_credits = threshold_credits.assign(
    director=threshold_credits['director'].str.split(',')
).explode('director')
threshold_credits['director'] = threshold_credits['director'].str.strip()

tv_directors = threshold_credits.loc[threshold_credits['type'] == 'TV Show', 'director'].value_counts()
movie_directors = threshold_credits.loc[threshold_credits['type'] == 'Movie', 'director'].value_counts()

# Largest selectable threshold per type (at least 1 so the dropdown is never empty).
tv_max = int(tv_directors.max()) if not tv_directors.empty else 1
movie_max = int(movie_directors.max()) if not movie_directors.empty else 1

# Create dropdowns for selecting thresholds. The default value is clamped to
# the available range because a Dropdown rejects a value outside its options.
tv_threshold_dropdown = widgets.Dropdown(
    options=[(str(i), i) for i in range(1, tv_max + 1)],
    value=min(2, tv_max),
    description='TV Shows Threshold:',
    disabled=False,
)
movie_threshold_dropdown = widgets.Dropdown(
    options=[(str(i), i) for i in range(1, movie_max + 1)],
    value=min(10, movie_max),
    description='Movies Threshold:',
    disabled=False,
)
# Function to update plots based on threshold values
def update_plots(tv_threshold, movie_threshold):
    """Draw the two director-frequency bar charts for the given thresholds.

    Args:
        tv_threshold: Minimum count a director needs to appear in the TV
            shows panel.
        movie_threshold: Minimum count a director needs to appear in the
            movies panel.
    """
    panels = (
        (tv_directors, tv_threshold, 'skyblue',
         'Frequency of Directors in TV Shows (>= {} occurrences)'),
        (movie_directors, movie_threshold, 'salmon',
         'Frequency of Directors in Movies (>= {} occurrences)'),
    )
    fig, axs = plt.subplots(1, 2, figsize=(16, 6))
    for ax, (counts, threshold, colour, title_template) in zip(axs, panels):
        kept = counts[counts >= threshold]
        ax.bar(kept.index, kept.values, color=colour)
        ax.set_xlabel('Director')
        ax.set_ylabel('Frequency')
        ax.set_title(title_template.format(threshold))
        ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.show()
# Display dropdowns and plot
display(tv_threshold_dropdown, movie_threshold_dropdown)
# Bind both dropdowns to update_plots as its two threshold arguments.
# NOTE(review): the returned widget is not passed to display(); presumably
# this relies on being the last expression of a notebook cell — confirm.
widgets.interactive(update_plots, tv_threshold=tv_threshold_dropdown, movie_threshold=movie_threshold_dropdown)
# Notebook output of the cell above:
# Dropdown(description='TV Shows Threshold:', index=1, options=(('1', 1), ('2', 2), ('3', 3)), value=2)
# Dropdown(description='Movies Threshold:', index=9, options=(('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), …
# interactive(children=(Dropdown(description='TV Shows Threshold:', index=1, options=(('1', 1), ('2', 2), ('3', …
# Reference used: https://medium.com/codex/how-to-automatically-generate-data-structure-for-sankey-diagrams-6082e332139f
import plotly.graph_objects as go
import pandas as pd
def map_year_to_decade(year):
    """Return the decade label for ``year``, e.g. ``1995`` -> ``'1990s'``.

    Args:
        year: A release year as an int or a float.

    Returns:
        The decade's first year followed by ``'s'``. Whole-number floats are
        rendered without a decimal part (``1995.0`` -> ``'1990s'``).
    """
    decade = year // 10 * 10
    # A float year would otherwise be rendered as '1990.0s'. NaN is not a
    # whole number, so it is left as it is.
    if isinstance(decade, float) and decade.is_integer():
        decade = int(decade)
    return f'{decade}s'
# Apply the function to create the new 'Decade' column from each row's release year
df['Decade'] = df['release_year'].apply(map_year_to_decade)
def data_snakey(data, path, value_col):
    """Build the node and link lists of a Sankey diagram from a DataFrame.

    Each consecutive pair of columns in ``path`` contributes one layer of
    links: for every (parent, child) combination that occurs in the data, a
    link is emitted whose value is the sum of ``value_col`` over the rows
    that have exactly that parent and that child.

    Args:
        data: DataFrame holding the ``path`` columns and ``value_col``.
        path: Column names ordered from the left-most layer to the
            right-most one.
        value_col: Column summed to obtain each link's value.

    Returns:
        A dict with four lists:
        ``label`` - node labels (values converted to ``str``), each listed
        once; equal strings coming from different columns share one node;
        ``source`` / ``target`` - indices into ``label`` for every link;
        ``value`` - the numeric value of every link.
    """
    sankey_data = {
        'label': [],
        'source': [],
        'target': [],
        'value': []
    }
    node_index = {}

    def node_id(name):
        # Register a label on first sight and return its position, so every
        # label exists once and look-ups do not rescan the list.
        key = str(name)
        if key not in node_index:
            node_index[key] = len(sankey_data['label'])
            sankey_data['label'].append(key)
        return node_index[key]

    for parent_col, child_col in zip(path, path[1:]):
        for parent in data[parent_col].unique():
            source = node_id(parent)
            # Restrict to this parent's rows so the link value below counts
            # only the flow between this parent and the child.
            parent_rows = data[data[parent_col] == parent]
            for sub in parent_rows[child_col].unique():
                target = node_id(sub)
                flow = parent_rows.loc[parent_rows[child_col] == sub, value_col].sum()
                # Store plain Python numbers rather than numpy scalars.
                if hasattr(flow, 'item'):
                    flow = flow.item()
                sankey_data['source'].append(source)
                sankey_data['target'].append(target)
                sankey_data['value'].append(flow)
    return sankey_data
# Size every link by the number of titles flowing through it. Summing
# release_year would make link widths depend on the years themselves rather
# than on how many titles there are, so a constant 1 per row is summed
# instead.
con_data = data_snakey(df.assign(title_count=1), ['type', 'rating', 'Decade'], 'title_count')
df
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=con_data['label'],
    ),
    link=dict(
        source=con_data['source'],
        target=con_data['target'],
        value=con_data['value'],
    ),
)])
fig.update_layout(height=700, margin={'t': 0, 'b': 0})
# Show the figure explicitly, as the other plotly figures in this file do.
fig.show()
# Reference used: https://github.com/alicelh/class-constrained-t-SNE
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
# Prepare data for t-SNE: encode the categorical columns as integer codes.
# The codes go into a separate feature table so that df['type'],
# df['rating'] and df['Decade'] keep their original values; overwriting them
# would break every `df['type'] == 'Movie'`-style filter on a re-run.
label_encoders = {}
encoded_columns = {}
for column in ['type', 'rating', 'Decade']:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le

# Select features for t-SNE
features = pd.DataFrame(encoded_columns, index=df.index)

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(features)

# Add t-SNE results to the DataFrame
df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]

# Plot using Plotly Express. Colour and hover now use the original category
# values instead of their integer codes.
fig_tsne = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color='Decade',
                      hover_data=['type', 'rating'], title='Netflix Content t-SNE Visualization')
fig_tsne.show()